import random
import time
import pymysql
import requests #导入调用网址的库
from bs4 import BeautifulSoup #导入解析库
from fake_useragent import UserAgent


# Douban subject pages of the movies to crawl. The order matters: the entry at
# index k is paired with the title at index k of the movie-title list.
# NOTE(review): index 34 used to repeat the preceding URL (subject/2129039),
# so that movie was crawled twice and stored under two titles. It now points
# at subject/26387939 -- confirm that this is the page for the title at
# index 34.
movie_urls = [
    'https://movie.douban.com/subject/1291546/', 'https://movie.douban.com/subject/1292720/', 'https://movie.douban.com/subject/1292722/', 'https://movie.douban.com/subject/1295644/', 'https://movie.douban.com/subject/1292063/',
    'https://movie.douban.com/subject/1291561/', 'https://movie.douban.com/subject/1295124/', 'https://movie.douban.com/subject/3541415/', 'https://movie.douban.com/subject/3011091/', 'https://movie.douban.com/subject/1889243/',
    'https://movie.douban.com/subject/1292064/', 'https://movie.douban.com/subject/1292001/', 'https://movie.douban.com/subject/3793023/', 'https://movie.douban.com/subject/2131459/', 'https://movie.douban.com/subject/1291549/',
    'https://movie.douban.com/subject/1307914/', 'https://movie.douban.com/subject/25662329/', 'https://movie.douban.com/subject/1292213/', 'https://movie.douban.com/subject/5912992/', 'https://movie.douban.com/subject/1291841/',
    'https://movie.douban.com/subject/1849031/', 'https://movie.douban.com/subject/1296141/', 'https://movie.douban.com/subject/3319755/', 'https://movie.douban.com/subject/1291560/', 'https://movie.douban.com/subject/6786002/',
    'https://movie.douban.com/subject/1293172/', 'https://movie.douban.com/subject/20495023/', 'https://movie.douban.com/subject/1851857/', 'https://movie.douban.com/subject/1292365/', 'https://movie.douban.com/subject/1295038/',
    'https://movie.douban.com/subject/1291552/', 'https://movie.douban.com/subject/1300267/', 'https://movie.douban.com/subject/21937452/', 'https://movie.douban.com/subject/2129039/', 'https://movie.douban.com/subject/26387939/',
    'https://movie.douban.com/subject/30170448/', 'https://movie.douban.com/subject/26752088/', 'https://movie.douban.com/subject/1293182/', 'https://movie.douban.com/subject/1308807/', 'https://movie.douban.com/subject/1929463/',
    'https://movie.douban.com/subject/1291858/', 'https://movie.douban.com/subject/1299398/', 'https://movie.douban.com/subject/1291583/', 'https://movie.douban.com/subject/1305487/', 'https://movie.douban.com/subject/1291828/',
    'https://movie.douban.com/subject/1291572/', 'https://movie.douban.com/subject/1298624/', 'https://movie.douban.com/subject/1293839/', 'https://movie.douban.com/subject/1296736/', 'https://movie.douban.com/subject/3742360/',
]

# Movie titles stored with every scraped comment. The title at index k belongs
# to the URL at index k of movie_urls, so both lists must have the same length
# and order. Every line ends with a trailing comma: a missing comma makes
# Python silently concatenate two adjacent titles into one string, which
# shortens the list and shifts every later title by one position.
movie_name = [
    '霸王别姬', '阿甘正传', '泰坦尼克号', '这个杀手不太冷', '美丽人生', '千与千寻', '辛德勒的名单', '盗梦空间', '忠犬八公的故事', '星际穿越',
    '楚门的世界', '海上钢琴师', '三傻大闹宝莱坞', '机器人总动员', '放牛班的春天', '无间道', '疯狂动物城', '大话西游之大圣娶亲', '熔炉', '教父',
    '当幸福来敲门', '控方证人', '怦然心动', '龙猫', '触不可及', '末代皇帝', '寻梦环游记', '蝙蝠侠：黑暗骑士', '活着', '哈利波特与魔法石',
    '指环王3：王者无敌', '乱世佳人', '素媛', '飞屋环游记', '摔跤吧！爸爸', '何以为家', '我不是药神', '十二怒汉', '哈尔的移动城堡', '少年派的奇幻漂流',
    '鬼子来了', '大话西游之月光宝盒', '天空之城', '猫鼠游戏', '天堂电影院', '指环王2：双塔奇兵', '闻香识女人', '罗马假日', '钢琴家', '让子弹飞',
]

# Build the short-comment listing URL for page i of a movie.
def get_com_url(movie_url, i):
    """Return the URL of comment page ``i`` for the given movie page.

    Args:
        movie_url: Movie page URL; the query path is appended to it verbatim,
            so it is expected to end with a slash.
        i: Zero-based page index. Each page holds 20 comments, so the
            ``start`` offset is ``i * 20``. The original note says ``i``
            should not exceed 10.

    Returns:
        ``movie_url`` followed by the ``comments?...`` query string.
    """
    print("bbb")
    # status=P selects "watched" comments; status=F would select "want to watch".
    query = 'comments?start={start}&limit=20&status=P&sort=new_score'.format(start=i * 20)
    return movie_url + query

# Scrape one comment page and store its comments; m selects the movie title.
def get_com(com_url, m):
    """Fetch a comment page, parse it and insert every comment into the database.

    Args:
        com_url: URL of the comment page to download.
        m: Index into the module-level ``movie_name`` list; the title found
            there is stored alongside each comment.

    Raises:
        IndexError: If ``m`` is outside ``movie_name``.

    A failed or timed-out request is reported on stdout and the page is
    skipped. After each insert the function sleeps for 0.5 seconds.
    """
    print("aaa")
    # Resolve the title first so a bad index fails before any network traffic.
    title = movie_name[m]

    # NOTE(review): the whole cookie string is passed as the value of a single
    # cookie named "cookie"; confirm this is intended rather than a "Cookie"
    # request header.
    cookies = {"cookie":'_ga=GA1.2.455762092.1647225481; _gid=GA1.2.1721074005.1647225481; _gcl_au=1.1.166519140.1647225481; Hm_lvt_03b99a010229760348f0c1d8982da2fa=1647225482; Hm_lpvt_03b99a010229760348f0c1d8982da2fa=1647225482; iid={A04BAC45-56BB-579F-2A37-C0DB82388325}; Hm_lvt_838efd399085078302a09c758755e893=1647225483; Hm_lpvt_838efd399085078302a09c758755e893=1647225483'}
    # A random User-Agent per request imitates a browser.
    headers = {'User-Agent': str(UserAgent().random)}

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        r = requests.get(com_url, headers=headers, cookies=cookies, timeout=10)
    except requests.RequestException as e:
        print("request failed for %s: %s" % (com_url, e))
        return

    bs = BeautifulSoup(r.text, "lxml")
    print("ccc")

    # One list per field, each in page order.
    anchors = [span.find("a") for span in bs.find_all("span", "comment-info")]
    username = [a.text if a is not None else "" for a in anchors]  # user name
    comment_time = [span.text.strip() for span in bs.find_all("span", "comment-time")]  # comment time
    votes = [span.text for span in bs.find_all("span", "votes")]  # "useful" count
    short = [span.text for span in bs.find_all("span", "short")]  # comment text
    print(len(username), len(comment_time), len(votes), len(short))

    # zip stops at the shortest list, so short or empty pages no longer raise
    # IndexError and the last comment of a full page is no longer dropped.
    for fields in zip(username, comment_time, votes, short):
        insert_cominfo((title,) + fields)
        time.sleep(0.5)

# Insert one comment row into the database.
def insert_cominfo(value):
    """Insert a single comment record into the ``aftercomment`` table.

    Args:
        value: Sequence of five values bound, in order, to the columns
            ``moviename``, ``user_name``, ``com_time``, ``use_num`` and
            ``comment``.

    A new connection is opened for every call and is always closed before
    returning. Errors raised while executing or committing are rolled back
    and reported on stdout instead of being propagated.
    """
    # Table: aftercomment; columns: moviename, user_name, com_time, use_num, comment
    sql = "INSERT INTO aftercomment(moviename, user_name, com_time, use_num, comment) values(%s, %s, %s, %s, %s)"

    # NOTE(review): the credentials are hard-coded in source; consider loading
    # them from configuration or the environment.
    db = pymysql.Connect(host='192.168.189.10', user='demo', password='FkSeNfbABEAWpP47', port=3306, db='demo')
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql, value)
            db.commit()
            print('插入数据成功')
        except Exception as e:
            # Deliberately broad: one bad row must not abort the whole crawl.
            db.rollback()
            print("执行MySQL: %s 时出错：%s" % (sql, e))
    finally:
        # Close even if cursor creation or the rollback itself raised, so the
        # connection is never leaked.
        db.close()

# Crawl 11 comment pages (page indices 0-10) for every movie in movie_urls.
# Iterating over the list itself keeps the loop in step with its length
# instead of relying on a hard-coded count.
for i, movie_url in enumerate(movie_urls):
    print("开始爬取第{}个电影".format(i+1))
    for j in range(11):
        print("开始爬取第{}页".format(j+1))
        com_url = get_com_url(movie_url, j)
        get_com(com_url, i)
        # Random pause between pages.
        time.sleep(random.randint(2, 5))

    # Longer pause between movies.
    time.sleep(5)
